1 Assumption

  1. Organic leads are considered as organic regardless of possible prior ads exposure. We do not have data on ads impression to attribute how much of organic data is from ads exposure or are they truly organic.
  2. We eliminated calls has dial time shorter than 30 second to standardize $10 for a phonecall.
  3. In lead table, all the interactions having LeadVendor/LeadType unknown and NAs under ‘Paid’ category, were regrouped into ‘Unspecified’. All miscellaneous leads (i.e., brandnetworks, MSN_Real_Estate) are regrouped under ‘Paid Other’. They might have some valuable information to understand the interaction prior conversion, yet we don’t have information for such data.
  4. Timezone is consistent. Even though the data is pulled from different sources, we assume all datetime recorded is at a same time zone.
  5. In lead table, Twitter, Facebook, Instagram, Linkedin leads are regrouped into ‘Paid Social’.
  6. In lead table, ‘Organic Social’ is considered as ‘Organic’ and was grouped together.
  7. In call table, missing REAgentID might be due to human errors and were eliminated because AgentID is critial for analysis.
  8. We assume all NPV is the same for other attribution model (even though they were given for last-touch model). We also used averages; more granular NPV data for call and meeting channels were unavailable.

2 Data

2.1 Clear working enviroment

# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)

2.2 Load Packages

if (!require("ChannelAttribution")) install.packages("ChannelAttribution")
if (!require("plotly")) install.packages("plotly")
if (!require("jsonlite")) install.packages("jsonlite")
if (!require("reshape")) install.packages("reshape")
library(jsonlite)
library(plotly)
library(readr)
library(tidyverse)
library(ChannelAttribution)
library(ggplot2)
library(reshape)

2.3 Load Data

#Sales call Dataframes
PhoneCalls20161001_20170228 <- read_csv("PhoneCalls20161001_20170228.csv") #From 2016-10-01 to 2017-02-28
PhoneCalls20170301_20170630 <- read_csv("PhoneCalls20170301_20170630.csv") #From 2017-03-01 to 2017-06-30

#Agent acquisition date by ZUID
agent_acquisition_date <- read_csv("AgentAcquisitionDates.csv")

#AgentID to ZUID mapping
agentid_zuid <- read_csv("AgentIDZUIDLookup.csv")

#Lead table
agent_lead <- read_csv("AgentLeads.csv")

#Sales meeting table
sales_meeting <- read_csv("SalesMeetings.csv")

#Union 2 Call Dataframes. Output: Call Dataframe from 2016-10-01 to 2017-06-30
phone_call <- bind_rows(PhoneCalls20161001_20170228, PhoneCalls20170301_20170630)

3 Data Preprocessing

3.1 Identify missing values

#Function returning the percentage of missing values for each column.
function_count_na <- function(df){
  percent_missing <-sapply(df, function(y) round(100*sum(length(which(is.na(y))))/sum(length(y)),2))
  percent_missing <- data.frame(percent_missing)
  percent_missing
}

function_count_na(phone_call) #REAgentID missing 69.6%, SalesRepID missing 0.5%, 3 other variables no missing data 
##                   percent_missing
## REAgentID                   69.56
## SalesRepID                   0.54
## PhoneCallType                0.00
## PhoneCallDateTime            0.00
## TalkTimeMinutes              0.00
function_count_na(agent_lead) 
##              percent_missing
## REAgentID               5.68
## SalesRepID             14.87
## LeadPlatform            0.00
## LeadType               24.52
## LeadVendor             68.18
## LeadDateTime            0.00
function_count_na(sales_meeting) #no missing data 
##                  percent_missing
## REAgentID                      0
## SalesRepID                     0
## SalesMeetingDate               0
function_count_na(agentid_zuid) #no missing data 
##           percent_missing
## REAgentID               0
## ZUID                    0
function_count_na(agent_acquisition_date) #Zuid missing 0%, AcquisitionDate missing 14.5%
##                 percent_missing
## ZUID                       0.00
## AcquisitionDate           14.54

3.2 Check Datetime range

#Check date range of agent_lead
max(agent_lead$LeadDateTime) #"2017-06-29 23:30:28 UTC"
## [1] "2017-06-29 23:30:28 UTC"
min(agent_lead$LeadDateTime) #"2016-10-01 00:00:49 UTC"
## [1] "2016-10-01 00:00:49 UTC"
#Check date range of agent_acquisition_date
max(na.omit(agent_acquisition_date$AcquisitionDate)) #"2017-07-05 UTC"
## [1] "2017-07-05 UTC"
min(na.omit(agent_acquisition_date$AcquisitionDate)) #"2009-01-01 UTC"
## [1] "2009-01-01 UTC"
#Check date range of sales_meeting
max(sales_meeting$SalesMeetingDate) #"2017-06-30 UTC"
## [1] "2017-06-30 UTC"
min(sales_meeting$SalesMeetingDate) #"2017-03-13 UTC"
## [1] "2017-03-13 UTC"

Observation

Lead table: * From Oct 2017 to June 2017

Acquisition table: * From Jan 2009 to July 2017 * Solution: Delete data before Oct 2017

Sales Meeting table: * Only from March 2017 to June 2017 * Comment: Limited information, might not be information for data before march 2017

#Filter agent_acquisition_date before 2016-10-01
agent_acquisition_date <- subset(agent_acquisition_date, AcquisitionDate >= "2016-10-01")

Result: 18,086 rows (originally 161,006 rows )

3.3 Organize and assign Channels

#Search for all unique lead vendors
agent_lead %>% group_by(agent_lead$LeadType) %>% distinct(LeadVendor)
## # A tibble: 88 x 2
## # Groups:   agent_lead$LeadType [6]
##    LeadVendor    `agent_lead$LeadType`
##    <chr>         <chr>                
##  1 <NA>          <NA>                 
##  2 email         Email                
##  3 Affiliate     Paid                 
##  4 <NA>          Organic              
##  5 Facebook      Paid                 
##  6 facebook      Paid                 
##  7 Gmail         Paid                 
##  8 Google Search Paid                 
##  9 Gdn           Paid                 
## 10 email_welcome Email                
## # … with 78 more rows
#Clean up marketing channel assignments
agent_lead <-agent_lead %>% mutate(LeadType=replace(LeadType, LeadType== "Social Organic", "Organic"),
                      LeadVendor=replace(LeadVendor, LeadType == "Organic", "Organic"),
                      LeadVendor=replace(LeadVendor, LeadVendor == "organic", "Organic"),
                      LeadVendor=replace(LeadVendor, LeadType == "Organic", "Organic"),
                      LeadVendor=replace(LeadVendor, LeadType == "unknown", "Unspecified"),
                      LeadVendor=replace(LeadVendor, LeadVendor == "unknown", "Unspecified"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Gdn", "Google Search"),
            LeadVendor=replace(LeadVendor, LeadVendor== "GDN", "Google Search"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Affiliate", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Linkedin Ads", "Linkedin"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Linkedin Display", "Linkedin"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Great_Schools", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Trulia", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "gemini", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "internal", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "IronTraffic", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "4197532", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "brandnetworks", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "MSN_Real_Estate", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "Unspecified" & LeadType == "Paid", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "http://www.2020propertygroup.com/monthly-payment-calculator/", "Paid Other"),
            LeadVendor=replace(LeadVendor, is.na(LeadVendor) & LeadType == "Paid", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "AreaVibes", "Paid Other"),
            LeadType=replace(LeadType, LeadVendor == "Organic" & LeadType == "Paid", "Paid Other"),
            LeadVendor=replace(LeadVendor, LeadVendor== "facebook" | LeadVendor== "Facebook" | 
                                 LeadVendor== "instagram" |LeadVendor== "Instagram"|
                                 LeadVendor == "Twitter" |LeadVendor== "Linkedin", "Paid Social"),
         LeadVendor=replace(LeadVendor, LeadVendor== "gmail"|LeadVendor == "Gmail" | 
                              LeadVendor == "Outlook" | LeadVendor == "Yahoo", "Email"),
         LeadType=replace(LeadType, LeadVendor== "Email", "Email"),         
         LeadVendor = replace(LeadVendor, LeadType == "Email", "Email"), 
         LeadVendor = replace(LeadVendor, is.na(LeadVendor), "Unspecified"),
         LeadType = replace(LeadType, is.na(LeadType), 'Unspecified'))

#Remove Unspecified non paid channels.
#agent_lead <- subset(agent_lead, LeadVendor != "Unspecified") 

agent_lead
## # A tibble: 271,890 x 6
##    REAgentID SalesRepID LeadPlatform LeadType LeadVendor
##    <chr>     <chr>      <chr>        <chr>    <chr>     
##  1 00340000… <NA>       Web          Unspeci… Unspecifi…
##  2 00333000… 005330000… Web          Unspeci… Unspecifi…
##  3 00333000… 005330000… Web          Email    Email     
##  4 00340000… <NA>       Web          Unspeci… Unspecifi…
##  5 00340000… <NA>       Web          Unspeci… Unspecifi…
##  6 00333000… 005400000… Web          Unspeci… Unspecifi…
##  7 00333000… 005330000… Web          Unspeci… Unspecifi…
##  8 00340000… <NA>       Web          Unspeci… Unspecifi…
##  9 00340000… <NA>       Web          Unspeci… Unspecifi…
## 10 00340000… <NA>       Web          Unspeci… Unspecifi…
## # … with 271,880 more rows, and 1 more variable: LeadDateTime <dttm>

3.4 Create an aggregate table for all channels

#Create a Marketing Channel aggregate dataframe
Channel_Data <- agent_lead %>% 
  group_by(LeadVendor) %>% 
  summarise(Total = n())

Channel_Data$Cost <- c(80, 10, 100, 0, 110, 150, 0)
Channel_Data$Total_Cost <- Channel_Data$Total*Channel_Data$Cost
Channel_Data$NPV <- c(7000, 8000, 7000, 7500, 7000, 7000, 7500)

#Create a Sales Channel calls aggregate dataframe
Channel_Calls <- phone_call %>% 
  filter(TalkTimeMinutes >= 0.5) %>%
  group_by(PhoneCallType) %>% 
  summarise(Total = n())

Channel_Calls <- Channel_Calls %>% dplyr::rename(LeadVendor = PhoneCallType)
Channel_Calls$Cost <- c(10, 10)
Channel_Calls$Total_Cost <- Channel_Calls$Total*Channel_Calls$Cost
Channel_Calls$NPV <- c(7500, 7500)

#Create a Sales Channel meetings aggregate dataframe
Channel_Meet <- sales_meeting %>% 
  summarise(Total = n())

Channel_Meet$Cost <- c(300)
Channel_Meet$Total_Cost <- Channel_Meet$Total*Channel_Meet$Cost
Channel_Meet$NPV <- c(7500)
Channel_Meet$LeadVendor <- "Meeting"

#Bind into an aggregate channels dataframe that will be useful later.
Channel_Data <- bind_rows(Channel_Data, Channel_Calls, Channel_Meet)
Channel_Data <- Channel_Data %>% 
  arrange(LeadVendor) %>%
  dplyr::rename(channel_name = LeadVendor)
Channel_Data
## # A tibble: 10 x 5
##    channel_name                Total  Cost Total_Cost   NPV
##    <chr>                       <int> <dbl>      <dbl> <dbl>
##  1 Bing Search                  2489    80     199120  7000
##  2 Email                       18403    10     184030  8000
##  3 Google Search               16112   100    1611200  7000
##  4 Meeting                      1003   300     300900  7500
##  5 Organic                    148132     0          0  7500
##  6 Paid Other                   7115   110     782650  7000
##  7 Paid Social                 12806   150    1920900  7000
##  8 RE Agent Called Sales Rep 4883311    10   48833110  7500
##  9 Sales Rep Called RE Agent 2411389    10   24113890  7500
## 10 Unspecified                 66833     0          0  7500

3.5 Join Data

#Left join agentid_zuid AND agent_acquisition_date ON zuid 
agentid_zuid_acquisitionDate <- left_join(agentid_zuid, agent_acquisition_date, by = "ZUID")

#Remove all ids without an acquisition date
agentid_zuid_acquisitionDate <- subset(agentid_zuid_acquisitionDate, !is.na(AcquisitionDate || !is.na(REAgentID)))
#Joing meeting with acquisition date
meeting_with_zuid <- sales_meeting %>% left_join(agentid_zuid_acquisitionDate, by = "REAgentID")
meeting_with_zuid <- meeting_with_zuid %>% filter(AcquisitionDate >= SalesMeetingDate | is.na(AcquisitionDate))
meeting_with_zuid
## # A tibble: 878 x 5
##    REAgentID    SalesRepID   SalesMeetingDate      ZUID AcquisitionDate    
##    <chr>        <chr>        <dttm>               <dbl> <dttm>             
##  1 0033300001f… 00533000002… 2017-03-13 00:00:00 3.81e6 NA                 
##  2 0034000001A… 00533000002… 2017-03-13 00:00:00 3.71e7 NA                 
##  3 0034000001A… 00540000001… 2017-03-21 00:00:00 1.30e7 NA                 
##  4 0034000001Q… 00540000001… 2017-03-21 00:00:00 2.09e7 NA                 
##  5 0034000001B… 00540000001… 2017-03-21 00:00:00 2.74e7 NA                 
##  6 0033300001l… 00540000001… 2017-03-21 00:00:00 7.03e6 NA                 
##  7 0034000000k… 00540000001… 2017-03-21 00:00:00 3.75e6 NA                 
##  8 0034000000g… 00540000001… 2017-03-21 00:00:00 2.24e6 NA                 
##  9 0034000000Q… 00540000001… 2017-03-22 00:00:00 1.31e6 NA                 
## 10 0033300001p… 00540000001… 2017-03-22 00:00:00 6.58e7 NA                 
## # … with 868 more rows
#Filter call data to calls over 2 minutes that have agent ids
phone_call<-subset(phone_call, phone_call$TalkTimeMinutes >= 2 & !is.na(REAgentID))

#Joing meeting with acquisition date
phoneCall_with_zuid <- phone_call %>% left_join(agentid_zuid_acquisitionDate, by = "REAgentID")
phoneCall_with_zuid <- subset(phoneCall_with_zuid, PhoneCallDateTime < AcquisitionDate | is.na(AcquisitionDate) | !is.na(REAgentID))
phoneCall_with_zuid
## # A tibble: 531,956 x 7
##    REAgentID SalesRepID PhoneCallType PhoneCallDateTime   TalkTimeMinutes
##    <chr>     <chr>      <chr>         <dttm>                        <dbl>
##  1 00333000… 005330000… Sales Rep Ca… 2016-10-01 08:52:21             4.2
##  2 00340000… 005330000… RE Agent Cal… 2016-10-01 09:28:21             8.5
##  3 00340000… 005330000… RE Agent Cal… 2016-10-01 10:14:55             3.3
##  4 00340000… 005330000… Sales Rep Ca… 2016-10-01 10:28:51            11.9
##  5 00340000… 005330000… Sales Rep Ca… 2016-10-01 10:50:57             2.1
##  6 00340000… 005330000… RE Agent Cal… 2016-10-01 11:01:17             2.2
##  7 00340000… 005330000… Sales Rep Ca… 2016-10-01 11:32:49             2.1
##  8 00340000… 005330000… Sales Rep Ca… 2016-10-01 11:42:17             2.4
##  9 00333000… 005330000… Sales Rep Ca… 2016-10-01 11:58:27             2.2
## 10 00340000… 005400000… Sales Rep Ca… 2016-10-01 12:43:06             3.8
## # … with 531,946 more rows, and 2 more variables: ZUID <dbl>,
## #   AcquisitionDate <dttm>
#Joing leads with acquisition date
lead_with_zuid <- left_join(agent_lead, agentid_zuid_acquisitionDate, by = "REAgentID")
lead_with_zuid <- subset(lead_with_zuid, LeadDateTime < AcquisitionDate || is.na(AcquisitionDate) || !is.na(REAgentID))
lead_with_zuid
## # A tibble: 271,890 x 8
##    REAgentID SalesRepID LeadPlatform LeadType LeadVendor
##    <chr>     <chr>      <chr>        <chr>    <chr>     
##  1 00340000… <NA>       Web          Unspeci… Unspecifi…
##  2 00333000… 005330000… Web          Unspeci… Unspecifi…
##  3 00333000… 005330000… Web          Email    Email     
##  4 00340000… <NA>       Web          Unspeci… Unspecifi…
##  5 00340000… <NA>       Web          Unspeci… Unspecifi…
##  6 00333000… 005400000… Web          Unspeci… Unspecifi…
##  7 00333000… 005330000… Web          Unspeci… Unspecifi…
##  8 00340000… <NA>       Web          Unspeci… Unspecifi…
##  9 00340000… <NA>       Web          Unspeci… Unspecifi…
## 10 00340000… <NA>       Web          Unspeci… Unspecifi…
## # … with 271,880 more rows, and 3 more variables: LeadDateTime <dttm>,
## #   ZUID <dbl>, AcquisitionDate <dttm>
#Creating dataframes that can be bound together
meeting <- meeting_with_zuid %>%
  select(REAgentID, SalesMeetingDate, AcquisitionDate) %>%
  dplyr::rename(Date = SalesMeetingDate)

phone <- phoneCall_with_zuid %>%
  select(REAgentID, PhoneCallType, PhoneCallDateTime, AcquisitionDate) %>%
  dplyr::rename(Channel = PhoneCallType, Date = PhoneCallDateTime)

leads <- lead_with_zuid %>%
  select(REAgentID, LeadVendor, LeadDateTime, AcquisitionDate) %>%
  dplyr::rename(Channel = LeadVendor, Date = LeadDateTime)

master <- bind_rows(leads, phone, meeting)

#Replace NA in Channel column
master$Channel[is.na(master$Channel)] <- "Meeting"

master <- master %>% filter(!is.na(REAgentID)) %>%
  mutate(Conversion = ifelse(is.na(AcquisitionDate), 0, 1))

  
master
## # A tibble: 789,273 x 5
##    REAgentID    Channel  Date                AcquisitionDate     Conversion
##    <chr>        <chr>    <dttm>              <dttm>                   <dbl>
##  1 0034000001S… Unspeci… 2016-10-01 00:00:49 NA                           0
##  2 0033300001p… Unspeci… 2016-10-01 00:02:06 NA                           0
##  3 0033300001p… Email    2016-10-01 00:02:56 NA                           0
##  4 0034000001S… Unspeci… 2016-10-01 00:05:39 NA                           0
##  5 0034000001S… Unspeci… 2016-10-01 00:08:12 NA                           0
##  6 0033300001o… Unspeci… 2016-10-01 00:10:59 NA                           0
##  7 0033300001h… Unspeci… 2016-10-01 00:14:00 NA                           0
##  8 0034000001S… Unspeci… 2016-10-01 00:14:32 NA                           0
##  9 0034000001S… Unspeci… 2016-10-01 00:18:20 NA                           0
## 10 0034000001S… Unspeci… 2016-10-01 00:20:27 NA                           0
## # … with 789,263 more rows

4 EDA

4.1 Customer journey to conversion statistics

#Add the average touches and average time
stats <- master %>% 
  filter(Conversion == 1) %>%
  arrange(REAgentID, Date) %>%
  group_by(REAgentID) %>% 
  summarise(Touch = n(), Days = (max(Date) - min(Date))) 
  
stats %>% summarise(Average_Touch = mean(Touch),
                    Max_Touch = max(Touch),
                    Average_Days = mean(Days),
                    Max_Days = max(Days)) 
## # A tibble: 1 x 4
##   Average_Touch Max_Touch Average_Days Max_Days     
##           <dbl>     <int> <drtn>       <drtn>       
## 1          5.26        46 85.0919 days 269.6749 days
#Distribution plot for days to conversion.
ggplot(stats %>% filter(Days > 0), aes(x = Days)) +
        theme_minimal() +
        geom_histogram(fill = '#4e79a7', binwidth = 7)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#Distribution plot for touches to conversion.
ggplot(stats, aes(x = Touch)) +
        theme_minimal() +
        geom_histogram(fill = '#4e79a7', binwidth = 1)

5 Analysis

# aggregating channels to the paths for each customer
paths <- master %>%
        filter(!is.na(REAgentID)) %>%
        arrange(REAgentID, Date) %>%
        group_by(REAgentID) %>%
        summarise(path = paste(Channel, collapse = ' > '),
                  conv = mean(Conversion)) %>%
        ungroup()
paths
## # A tibble: 298,342 x 3
##    REAgentID        path                                               conv
##    <chr>            <chr>                                             <dbl>
##  1 0033000000Ea22X… Sales Rep Called RE Agent > Sales Rep Called RE …     0
##  2 0033000000EOCWG… Email                                                 0
##  3 0033000000EqCu4… Unspecified > Unspecified > Unspecified > Unspec…     0
##  4 0033000000EqCv6… Unspecified                                           0
##  5 0033000000EqCw2… Sales Rep Called RE Agent > Sales Rep Called RE …     0
##  6 0033000000EqD1S… Organic > Sales Rep Called RE Agent > Sales Rep …     1
##  7 0033000000EqD1Y… Sales Rep Called RE Agent > RE Agent Called Sale…     0
##  8 0033000000FhHV9… Sales Rep Called RE Agent                             0
##  9 0033000000GBnTr… Sales Rep Called RE Agent                             0
## 10 0033000000GBrMA… RE Agent Called Sales Rep > Sales Rep Called RE …     0
## # … with 298,332 more rows

5.1 Build the smarketing attribution models

#Calculating the Markov model
markov <- markov_model(paths,
                    var_path = 'path',
                    var_conv = 'conv',
                    out_more = TRUE)

#Calculating the other models
h_mod <- heuristic_models(paths, 
                           var_path = 'path', 
                           var_conv = 'conv')

#Merges the two data frames on the "channel_name" column.
results <- merge(h_mod, markov$result, by='channel_name') 

#Rename the columns
colnames(results) <- c('channel_name', 'first_touch', 'last_touch', 'linear_touch', 'markov_model') 

results
##                 channel_name first_touch last_touch linear_touch
## 1                Bing Search         196         22    72.233785
## 2                      Email         858        227   414.892935
## 3              Google Search        1300        135   439.074995
## 4                    Meeting           4          3     3.916667
## 5                    Organic        6787       2115  3650.616441
## 6                 Paid Other         380         42   135.014880
## 7                Paid Social         684         65   210.588133
## 8  RE Agent Called Sales Rep        1762       3686  3075.819770
## 9  Sales Rep Called RE Agent        4088       9642  7996.318503
## 10               Unspecified         376        498   436.523892
##    markov_model
## 1    103.421530
## 2    680.804275
## 3    690.893664
## 4      3.035995
## 5   3996.760609
## 6    199.994420
## 7    346.830681
## 8   3925.188787
## 9   5748.776963
## 10   739.293076
#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results_graph <- melt(results, id='channel_name')
# Plot the total conversions
ggplot(results_graph, aes(x = reorder(channel_name, value), y = value, fill =variable)) +
  geom_bar(stat='identity', position='dodge') +
  ggtitle('Total Conversions') + 
  theme(panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()) +
  coord_flip() +
  theme_classic() +
  scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
  theme(axis.text = element_text(face = "bold", size = 10),
      axis.title = element_blank(),
      axis.ticks.x = element_blank(),
      legend.title = element_text(face = "bold"),
      legend.position = c(0.9, 0.2),
      plot.title = element_text(hjust=0.5, face ="bold"),
      plot.subtitle = element_text(hjust = 0.5)) 

5.1.1 Results for last touch attribution.

#Listing the results for the last touch attribution for smarketing
results %>% select(channel_name, last_touch) %>%
  mutate(conversion_rate = last_touch/Channel_Data$Total) %>%
  mutate(CPA = Channel_Data$Total_Cost/last_touch) %>%
  mutate(ROI = ((Channel_Data$NPV*last_touch) / Channel_Data$Total_Cost))
##                 channel_name last_touch conversion_rate         CPA
## 1                Bing Search         22    0.0088388911   9050.9091
## 2                      Email        227    0.0123349454    810.7048
## 3              Google Search        135    0.0083788481  11934.8148
## 4                    Meeting          3    0.0029910269 100300.0000
## 5                    Organic       2115    0.0142778063      0.0000
## 6                 Paid Other         42    0.0059030218  18634.5238
## 7                Paid Social         65    0.0050757457  29552.3077
## 8  RE Agent Called Sales Rep       3686    0.0007548157  13248.2664
## 9  Sales Rep Called RE Agent       9642    0.0039985253   2500.9220
## 10               Unspecified        498    0.0074514087      0.0000
##           ROI
## 1  0.77340297
## 2  9.86795631
## 3  0.58651936
## 4  0.07477567
## 5         Inf
## 6  0.37564684
## 7  0.23686813
## 8  0.56611180
## 9  2.99889400
## 10        Inf

5.1.2 Results for first touch attribution.

#Listing the results for the first touch attribution for smarketing
results %>% select(channel_name, first_touch) %>%
  mutate(conversion_rate = first_touch/Channel_Data$Total) %>%
  mutate(CPA = Channel_Data$Total_Cost/first_touch) %>%
  mutate(ROI = ((Channel_Data$NPV*first_touch) / Channel_Data$Total_Cost))
##                 channel_name first_touch conversion_rate        CPA
## 1                Bing Search         196    0.0787464845  1015.9184
## 2                      Email         858    0.0466228332   214.4872
## 3              Google Search        1300    0.0806852036  1239.3846
## 4                    Meeting           4    0.0039880359 75225.0000
## 5                    Organic        6787    0.0458172441     0.0000
## 6                 Paid Other         380    0.0534082923  2059.6053
## 7                Paid Social         684    0.0534124629  2808.3333
## 8  RE Agent Called Sales Rep        1762    0.0003608208 27714.5914
## 9  Sales Rep Called RE Agent        4088    0.0016952885  5898.7011
## 10               Unspecified         376    0.0056259632     0.0000
##           ROI
## 1   6.8903174
## 2  37.2982666
## 3   5.6479643
## 4   0.0997009
## 5         Inf
## 6   3.3987095
## 7   2.4925816
## 8   0.2706156
## 9   1.2714664
## 10        Inf

5.1.3 Results for linear attribution

#Listing the results for the linear attribution model for smarketing
results %>% select(channel_name, linear_touch) %>%
  mutate(conversion_rate = linear_touch/Channel_Data$Total) %>%
  mutate(CPA = Channel_Data$Total_Cost/linear_touch) %>%
  mutate(ROI = ((Channel_Data$NPV*linear_touch) / Channel_Data$Total_Cost))
##                 channel_name linear_touch conversion_rate        CPA
## 1                Bing Search    72.233785    0.0290212072  2756.6048
## 2                      Email   414.892935    0.0225448533   443.5602
## 3              Google Search   439.074995    0.0272514272  3669.5326
## 4                    Meeting     3.916667    0.0039049518 76825.5319
## 5                    Organic  3650.616441    0.0246443472     0.0000
## 6                 Paid Other   135.014880    0.0189760899  5796.7685
## 7                Paid Social   210.588133    0.0164444895  9121.5966
## 8  RE Agent Called Sales Rep  3075.819770    0.0006298636 15876.4536
## 9  Sales Rep Called RE Agent  7996.318503    0.0033160633  3015.6240
## 10               Unspecified   436.523892    0.0065315621     0.0000
##           ROI
## 1   2.5393556
## 2  18.0358826
## 3   1.9075999
## 4   0.0976238
## 5         Inf
## 6   1.2075694
## 7   0.7674095
## 8   0.4723977
## 9   2.4870475
## 10        Inf

5.1.4 Results for the markov chain model

#Listing the results for the markov chain attribution for smarketing
results %>% select(channel_name, markov_model) %>%
  mutate(conversion_rate = markov_model/Channel_Data$Total) %>%
  mutate(CPA = Channel_Data$Total_Cost/markov_model) %>%
  mutate(ROI = ((Channel_Data$NPV*markov_model) / Channel_Data$Total_Cost))
##                 channel_name markov_model conversion_rate        CPA
## 1                Bing Search   103.421530    0.0415514383  1925.3244
## 2                      Email   680.804275    0.0369942007   270.3126
## 3              Google Search   690.893664    0.0428806892  2332.0521
## 4                    Meeting     3.035995    0.0030269145 99110.8282
## 5                    Organic  3996.760609    0.0269810750     0.0000
## 6                 Paid Other   199.994420    0.0281088432  3913.3592
## 7                Paid Social   346.830681    0.0270834516  5538.4374
## 8  RE Agent Called Sales Rep  3925.188787    0.0008037966 12440.9583
## 9  Sales Rep Called RE Agent  5748.776963    0.0023840106  4194.6122
## 10               Unspecified   739.293076    0.0110617970     0.0000
##            ROI
## 1   3.63575085
## 2  29.59536055
## 3   3.00164824
## 4   0.07567286
## 5          Inf
## 6   1.78874457
## 7   1.26389441
## 8   0.60284745
## 9   1.78800796
## 10         Inf

5.1.5 ROI for each attribution strategy

results_ROI <- results %>% 
  mutate(first_ROI = (Channel_Data$NPV*first_touch) / Channel_Data$Total_Cost) %>%
  mutate(last_ROI = (Channel_Data$NPV*last_touch) / Channel_Data$Total_Cost) %>%
  mutate(linear_ROI = (Channel_Data$NPV*linear_touch) / Channel_Data$Total_Cost) %>%
  mutate(markov_ROI = (Channel_Data$NPV*markov_model) / Channel_Data$Total_Cost) %>%
  select(channel_name, first_ROI, last_ROI, linear_ROI, markov_ROI) %>%
  filter(channel_name != "Organic" & channel_name != "Unspecified")

results_ROI
##                channel_name  first_ROI   last_ROI linear_ROI  markov_ROI
## 1               Bing Search  6.8903174 0.77340297  2.5393556  3.63575085
## 2                     Email 37.2982666 9.86795631 18.0358826 29.59536055
## 3             Google Search  5.6479643 0.58651936  1.9075999  3.00164824
## 4                   Meeting  0.0997009 0.07477567  0.0976238  0.07567286
## 5                Paid Other  3.3987095 0.37564684  1.2075694  1.78874457
## 6               Paid Social  2.4925816 0.23686813  0.7674095  1.26389441
## 7 RE Agent Called Sales Rep  0.2706156 0.56611180  0.4723977  0.60284745
## 8 Sales Rep Called RE Agent  1.2714664 2.99889400  2.4870475  1.78800796

5.1.6 Graph ROI accross channels

#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results_ROI <- melt(results_ROI, id='channel_name')

# Plot the ROIs
ggplot(results_ROI, aes(x = reorder(channel_name, value), y = value, fill =variable)) +
  geom_bar(stat='identity', position='dodge') +
  ggtitle('ROI Per Channel') + 
  theme(panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()) +
  coord_flip() +
  theme_classic() +
  scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
  theme(axis.text = element_text(face = "bold", size = 10),
      axis.title = element_blank(),
      axis.ticks.x = element_blank(),
      legend.title = element_text(face = "bold"),
      legend.position = c(0.9, 0.2),
      plot.title = element_text(hjust=0.5, face ="bold"),
      plot.subtitle = element_text(hjust = 0.5)) 

5.2 Build the attribution models for marketing channels only

marketing_master <- master %>% filter(Channel != "Meeting" & Channel != "RE Agent Called Sales Rep" & Channel != "Sales Rep Called RE Agent")

# aggregating channels to the paths for each customer
paths2 <- marketing_master %>%
        arrange(REAgentID, Date) %>%
        group_by(REAgentID) %>%
        summarise(path = paste(Channel, collapse = ' > '),
                  conv = mean(Conversion)) %>%
        ungroup()
#Calculating the Markov model
markov <- markov_model(paths2,
                    var_path = 'path',
                    var_conv = 'conv',
                    out_more = TRUE)

#Calculating the other models
h_mod <- heuristic_models(paths2, 
                           var_path = 'path', 
                           var_conv = 'conv')

#Merges the two data frames on the "channel_name" column.
results2 <- merge(h_mod, markov$result, by='channel_name') 

#Rename the columns
colnames(results2) <- c('channel_name', 'first_touch', 'last_touch', 'linear_touch', 'markov_model') 

results2
##    channel_name first_touch last_touch linear_touch markov_model
## 1   Bing Search         208        139     168.6668     191.7925
## 2         Email        1028       1011    1014.3919    1221.9004
## 3 Google Search        1339        908    1109.8847    1245.7677
## 4       Organic        7946       8299    8185.9571    7349.3587
## 5    Paid Other         386        279     325.9168     364.4154
## 6   Paid Social         698        431     542.7235     620.3554
## 7   Unspecified         736       1274     993.4594    1347.4099
#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results2_graph <- melt(results2, id='channel_name')

5.2.1 Results for last touch attribution

m_channel <- Channel_Data %>% filter(channel_name != "Meeting" & channel_name != "Sales Rep Called RE Agent" & channel_name != "RE Agent Called Sales Rep")

results2 %>% select(channel_name, last_touch) %>%
  mutate(conversion_rate = last_touch/m_channel$Total) %>%
  mutate(CPA = m_channel$Total_Cost/last_touch) %>%
  mutate(ROI = (m_channel$NPV*last_touch) / m_channel$Total_Cost)
##    channel_name last_touch conversion_rate       CPA       ROI
## 1   Bing Search        139      0.05584572 1432.5180  4.886501
## 2         Email       1011      0.05493670  182.0277 43.949356
## 3 Google Search        908      0.05635551 1774.4493  3.944886
## 4       Organic       8299      0.05602436    0.0000       Inf
## 5    Paid Other        279      0.03921293 2805.1971  2.495368
## 6   Paid Social        431      0.03365610 4456.8445  1.570618
## 7   Unspecified       1274      0.01906244    0.0000       Inf

5.2.2 Results for markov chain attribution

results2 %>% select(channel_name, markov_model) %>%
  mutate(conversion_rate = markov_model/m_channel$Total) %>%
  mutate(CPA = m_channel$Total_Cost/markov_model) %>%
  mutate(ROI = (m_channel$NPV*markov_model) / m_channel$Total_Cost)
##    channel_name markov_model conversion_rate       CPA       ROI
## 1   Bing Search     191.7925      0.07705603 1038.2056  6.742402
## 2         Email    1221.9004      0.06639681  150.6097 53.117444
## 3 Google Search    1245.7677      0.07731925 1293.3390  5.412347
## 4       Organic    7349.3587      0.04961358    0.0000       Inf
## 5    Paid Other     364.4154      0.05121790 2147.6866  3.259321
## 6   Paid Social     620.3554      0.04844256 3096.4508  2.260653
## 7   Unspecified    1347.4099      0.02016085    0.0000       Inf
# Plot the total conversions
ggplot(results2_graph, aes(channel_name, value, fill = variable)) +
  geom_bar(stat='identity', position='dodge') +
  ggtitle('Total Conversions') + 
  theme(panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()) +
  theme_classic() +
  scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
  theme(axis.text = element_text(face = "bold", size = 10),
      axis.title = element_blank(),
      axis.ticks.x = element_blank(),
      legend.title = element_text(face = "bold"),
      legend.position = c(0.9, 0.8),
      plot.title = element_text(hjust=0.5, face ="bold"),
      plot.subtitle = element_text(hjust = 0.5)) 

6 Visualize Path with Sankey diagram

# Construct conversions sequences for all visitors
master1 = master %>%
  group_by(REAgentID) %>%
  arrange(Date) %>%
  mutate(order_seq = ifelse(Conversion > 0, 1, NA)) %>%
  mutate(order_seq = lag(cumsum(ifelse(is.na(order_seq), 0, order_seq)))) %>%
  mutate(order_seq = ifelse((row_number() == 1) & (Conversion > 0), 
    -1, ifelse(row_number() == 1, 0, order_seq))) %>% 
  ungroup()

# Create a modified channel stacks data frame
channel_stacks = master1 %>%
  group_by(REAgentID, order_seq) %>%
  
  #first remove irrelevant hits:
  filter(!is.na(Channel) | Conversion>0) %>%
  
  #next remove repeated values with a lag function:
  filter((Channel != lag(Channel, default="1")) | Conversion>0) %>%
  
  #now concatenate the sequence into a single row:
  summarize(
    path = paste(Channel[which(!is.na(Channel))], collapse=">"),
    
    # for Spark SQL or PostgreSQL:
    # path = concat_ws(" > ", collect_list(Channel))
    
    Conversion = sum(Conversion)
  ) %>% ungroup() %>%
  
  #next roll up each unique path by count and conversion:
  group_by(path) %>%
  summarize(
    Conversion = sum(Conversion),
    path_count = n()
  ) %>% ungroup() %>%
  
  #last create a conversion rate column and pull it out of Spark:
  mutate(
    conversion_rate = Conversion/path_count
  ) %>%
  filter(path != "") %>%
  collect()


# Visualizing customer paths with a Sankey Diagram
# Creating a list of channels for convinience
channel_stacks$path_list = strsplit(x=channel_stacks$path,split=">")
# set the depth of the Sankey Diagram
depth = 7
#Generate node labels and label length vectors
node_labels=rep(list(list()),depth)
label_length = list()
for(i in 1:depth){
  for(j in 1:length(channel_stacks$path)){
    if(!is.na(channel_stacks$path_list[j][[1]][i]))
      node_labels[[i]][j] = channel_stacks$path_list[j][[1]][i]
  }
  node_labels[[i]] = unique(unlist(node_labels[[i]]))
  node_labels[[i]] = node_labels[[i]][order(node_labels[[i]])]
  label_length[[i]] = length(node_labels[[i]])
}
node_labels = unlist(node_labels)
label_length = unlist(label_length)


# Build a data frame to fill out with each path view
combos = NULL
for(i in 1:(depth-1)){
  for(j in (1 + sum(label_length[1:i-1])):(label_length[i] + sum(label_length[1:i-1]))){
    for(k in (1 + label_length[i] + sum(label_length[1:i-1])):(label_length[i+1] + label_length[i] + sum(label_length[1:i-1]))){
      combos = rbind(combos, c(i,j,k,0))
    } 
  }
}
combos = as.data.frame(combos)
names(combos) = c("step","source","target","value")
#Populate the combo table
for(i in 1:(dim(combos)[1])){
  for(j in 1:(dim(channel_stacks)[1])){
    combos$value[i] = sum(combos$value[i], ifelse(
      (node_labels[combos$source[i]] == channel_stacks$path_list[j][[1]][combos$step[i]]) &
      (node_labels[combos$target[i]] == channel_stacks$path_list[j][[1]][combos$step[i]+1]),
      channel_stacks$path_count[j],0), na.rm = TRUE)
  }
}


#Add a node to populate with conversion values
uniques = unique(c(combos$source,combos$target))
converts = as.data.frame(list("step"=rep(0,length(uniques)), "source"=uniques, "target"=rep(max(uniques)+1,length(uniques)), "value"=rep(0,length(uniques))))
combos = rbind(combos,converts)
for(i in 1:(dim(channel_stacks)[1])){
  stack_depth = min(depth,length(channel_stacks$path_list[i][[1]]))
  index_val = which(combos$step==0 & combos$source==(which(node_labels == channel_stacks$path_list[i][[1]][stack_depth]) + ifelse(stack_depth>1, sum(label_length[1:(stack_depth-1)]),0)))
  combos$value[index_val] = combos$value[index_val] + channel_stacks$Conversion[i]
}
#Populate the conversion node values
display_node_labels = node_labels
for(i in 1:length(label_length)){
  for(j in 1:label_length[i]){
    display_node_labels[j+ifelse(i==1,0,sum(label_length[1:(i-1)]))] = paste0(i,":",node_labels[j+ifelse(i==1,0,sum(label_length[1:(i-1)]))])
  }
}
display_node_labels = c(display_node_labels, "Conversion")

#Generate Sankey diagram
p <- plot_ly(
    type = "sankey",
    orientation = "v",
    node = list(
      label = display_node_labels,
      #color = node_colors,
      pad = 10,
      thickness = 30,
      line = list(
        color = "white",
        width = 0
      )
    ),
  
    link = list(
      source = combos$source-1, # convert to zero index
      target = combos$target-1, # convert to zero index
      value = combos$value, #size of connection
      color = "rgba(0, 0, 0, 0.2)"
    )
  ) %>% 
  layout(
    title = "Conversion Flow Diagram",
    font = list(
    size = 10
    )
  )
p